library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(d3heatmap)
library(ggplot2)

C <- read_csv("Group9/brazilian-ecommerce/olist_customers_dataset.csv")
## Parsed with column specification:
## cols(
##   customer_id = col_character(),
##   customer_unique_id = col_character(),
##   customer_zip_code_prefix = col_character(),
##   customer_city = col_character(),
##   customer_state = col_character()
## )
Lo <- read_csv("Group9/brazilian-ecommerce/olist_geolocation_dataset.csv")
## Parsed with column specification:
## cols(
##   geolocation_zip_code_prefix = col_character(),
##   geolocation_lat = col_double(),
##   geolocation_lng = col_double(),
##   geolocation_city = col_character(),
##   geolocation_state = col_character()
## )
It <- read_csv("Group9/brazilian-ecommerce/olist_order_items_dataset.csv")
## Parsed with column specification:
## cols(
##   order_id = col_character(),
##   order_item_id = col_integer(),
##   product_id = col_character(),
##   seller_id = col_character(),
##   shipping_limit_date = col_datetime(format = ""),
##   price = col_double(),
##   freight_value = col_double()
## )
pay <- read_csv("Group9/brazilian-ecommerce/olist_order_payments_dataset.csv")
## Parsed with column specification:
## cols(
##   order_id = col_character(),
##   payment_sequential = col_integer(),
##   payment_type = col_character(),
##   payment_installments = col_integer(),
##   payment_value = col_double()
## )
re <- read_csv("Group9/brazilian-ecommerce/olist_order_reviews_dataset.csv")
## Parsed with column specification:
## cols(
##   review_id = col_character(),
##   order_id = col_character(),
##   review_score = col_integer(),
##   review_comment_title = col_character(),
##   review_comment_message = col_character(),
##   review_creation_date = col_datetime(format = ""),
##   review_answer_timestamp = col_datetime(format = "")
## )
order <- read_csv("Group9/brazilian-ecommerce/olist_orders_dataset.csv")
## Parsed with column specification:
## cols(
##   order_id = col_character(),
##   customer_id = col_character(),
##   order_status = col_character(),
##   order_purchase_timestamp = col_datetime(format = ""),
##   order_approved_at = col_datetime(format = ""),
##   order_delivered_carrier_date = col_datetime(format = ""),
##   order_delivered_customer_date = col_datetime(format = ""),
##   order_estimated_delivery_date = col_datetime(format = "")
## )
prod <- read_csv("Group9/brazilian-ecommerce/olist_products_dataset.csv")
## Parsed with column specification:
## cols(
##   product_id = col_character(),
##   product_category_name = col_character(),
##   product_name_lenght = col_integer(),
##   product_description_lenght = col_integer(),
##   product_photos_qty = col_integer(),
##   product_weight_g = col_integer(),
##   product_length_cm = col_integer(),
##   product_height_cm = col_integer(),
##   product_width_cm = col_integer()
## )
sell <- read_csv("Group9/brazilian-ecommerce/olist_sellers_dataset.csv")
## Parsed with column specification:
## cols(
##   seller_id = col_character(),
##   seller_zip_code_prefix = col_character(),
##   seller_city = col_character(),
##   seller_state = col_character()
## )
categ <- read_csv("Group9/brazilian-ecommerce/product_category_name_translation.csv")
## Parsed with column specification:
## cols(
##   product_category_name = col_character(),
##   product_category_name_english = col_character()
## )
unique(order$order_id) %>% length()    ### 99441 different order
## [1] 99441
unique(order$customer_id) %>% length()  ### 99441 customer ID
## [1] 99441
duplicated(order$customer_id) %>% table()  ### false 99441
## .
## FALSE 
## 99441
order$order_id %in% re$order_id %>% table() ### TRUE
## .
##  TRUE 
## 99441
###

re$review_id %>% duplicated() %>% table() ### TRUE 827, FALSE 99173
## .
## FALSE  TRUE 
## 99173   827
re$order_id %>% duplicated() %>% table() ### TRUE 559, FALSE 99441
## .
## FALSE  TRUE 
## 99441   559
unique(re$order_id) %>% length() ### 99441
## [1] 99441
setdiff(unique(re$order_id),unique(order$order_id)) ### character(0)
## character(0)
duplicated(C[,c(1,2)]) %>% table()
## .
## FALSE 
## 99441
###

order$customer_id %>% unique() %>% length() ### 99411 
## [1] 99441
C$customer_unique_id %>% unique() %>% length() ### 96096
## [1] 96096
C$customer_unique_id %>% table %>% table()
## .
##     1     2     3     4     5     6     7     9    17 
## 93099  2745   203    30     8     6     3     1     1
#     1     2     3     4     5     6     7     9    17 
# 93099  2745   203    30     8     6     3     1     1 

C[,c(4, 5)] <- lapply(C[,c(4, 5)], factor) 
summary(C)
##  customer_id        customer_unique_id customer_zip_code_prefix
##  Length:99441       Length:99441       Length:99441            
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##                                                                
##         customer_city   customer_state 
##  sao paulo     :15540   SP     :41746  
##  rio de janeiro: 6882   RJ     :12852  
##  belo horizonte: 2773   MG     :11635  
##  brasilia      : 2131   RS     : 5466  
##  curitiba      : 1521   PR     : 5045  
##  campinas      : 1444   SC     : 3637  
##  (Other)       :69150   (Other):19060
C$customer_city %>% table() %>% sort %>% tail(10)
## .
## sao bernardo do campo             guarulhos              salvador 
##                   938                  1189                  1245 
##          porto alegre              campinas              curitiba 
##                  1379                  1444                  1521 
##              brasilia        belo horizonte        rio de janeiro 
##                  2131                  2773                  6882 
##             sao paulo 
##                 15540
# sao bernardo do campo             guarulhos              salvador          porto alegre              campinas 
#                  938                  1189                  1245                  1379                  1444 
# curitiba              brasilia        belo horizonte        rio de janeiro             sao paulo 
#     1521                  2131                  2773                  6882                 15540 

### 

setdiff(unique(pay$order_id),unique(order$order_id)) ### character(0)
## character(0)
setdiff(unique(order$order_id),unique(pay$order_id)) ### "bfbd0f9bdef84302105ad712db648a6c"
## [1] "bfbd0f9bdef84302105ad712db648a6c"
pay$payment_type <- as.factor(pay$payment_type)
unique(pay$order_id) %>% length() ### 99440
## [1] 99440
pay$order_id %>% table() %>% table() 
## .
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 96479  2382   301   108    52    36    28    11     9     5     8     8 
##    13    14    15    19    21    22    26    29 
##     3     2     2     2     1     1     1     1
pay$payment_sequential %>% table() %>% sum() ### 103886
## [1] 103886
#     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    19    21    22    26    29 
# 96479  2382   301   108    52    36    28    11     9     5     8     8     3     2     2     2     1     1     1     1 

pay[pay$payment_sequential == 2,]$order_id %in% pay[pay$payment_sequential == 1,]$order_id %>% table()
## .
## FALSE  TRUE 
##    80  2959
# FALSE  TRUE 
#    80  2959 

summary(pay)
##    order_id         payment_sequential      payment_type  
##  Length:103886      Min.   : 1.000     boleto     :19784  
##  Class :character   1st Qu.: 1.000     credit_card:76795  
##  Mode  :character   Median : 1.000     debit_card : 1529  
##                     Mean   : 1.093     not_defined:    3  
##                     3rd Qu.: 1.000     voucher    : 5775  
##                     Max.   :29.000                        
##  payment_installments payment_value     
##  Min.   : 0.000       Min.   :    0.00  
##  1st Qu.: 1.000       1st Qu.:   56.79  
##  Median : 1.000       Median :  100.00  
##  Mean   : 2.853       Mean   :  154.10  
##  3rd Qu.: 4.000       3rd Qu.:  171.84  
##  Max.   :24.000       Max.   :13664.08
# order_id         payment_sequential      payment_type   payment_installments   payment_value  
# Length:103886      Min.   : 1.00      boleto     :19784   Min.   : 0.00        Min.   :    0  
# Class :character   1st Qu.: 1.00      credit_card:76795   1st Qu.: 1.00        1st Qu.:   57  
# Mode  :character   Median : 1.00      debit_card : 1529   Median : 1.00        Median :  100  
#                    Mean   : 1.09      not_defined:    3   Mean   : 2.85        Mean   :  154  
#                    3rd Qu.: 1.00      voucher    : 5775   3rd Qu.: 4.00        3rd Qu.:  172  
#                    Max.   :29.00                          Max.   :24.00        Max.   :13664  
# 

### It

It$order_id %>% unique() %>% length() ### 98666 
## [1] 98666
It$order_id %in% order$order_id %>% table() ### TRUE
## .
##   TRUE 
## 112650
order$order_id %in% It$order_id %>% table() ### TRUE 98666 , FALSE 775
## .
## FALSE  TRUE 
##   775 98666
It$order_id %>% table() %>% table() 
## .
##     1     2     3     4     5     6     7     8     9    10    11    12 
## 88863  7516  1322   505   204   198    22     8     3     8     4     5 
##    13    14    15    20    21 
##     1     2     2     2     1
#     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    20    21 
# 88863  7516  1322   505   204   198    22     8     3     8     4     5     1     2     2     2     1 

It[It$order_item_id == 2,]$order_id %in% It[It$order_item_id == 1,]$order_id %>% table() ### TRUE
## .
## TRUE 
## 9803
# TRUE 
# 9803 

It$product_id %>% unique() %>% length() ### 32951
## [1] 32951
It$seller_id %>% unique() %>% length() ### 3095
## [1] 3095
### 

prod$product_id %>% unique() %>% length() ### 32951
## [1] 32951
setdiff(unique(It$product_id), unique(prod$product_id)) ### character(0)
## character(0)
prod$product_category_name %>% unique() %>% length() ### 74
## [1] 74
###
categ$product_category_name %>% unique() %>% length() ### 71
## [1] 71
setdiff(unique(prod$product_category_name), unique(categ$product_category_name))
## [1] NA                                             
## [2] "pc_gamer"                                     
## [3] "portateis_cozinha_e_preparadores_de_alimentos"
### NA , "pc_gamer" , "portateis_cozinha_e_preparadores_de_alimentos"


###


Z = It %>% group_by(seller_id) %>% summarise(
  ItemsSold = n(),
  Rev = sum(price),
  noProd = n_distinct(product_id),
  avgPrice = mean(price),
  maxPrice = max(price),
  minPrice = min(price),
  avgFreight = mean(freight_value),
  avgRevProd = Rev/noProd,
  avgItemsProd = ItemsSold/noProd
)
summary(Z)
##   seller_id           ItemsSold           Rev               noProd      
##  Length:3095        Min.   :   1.0   Min.   :     3.5   Min.   :  1.00  
##  Class :character   1st Qu.:   2.0   1st Qu.:   208.8   1st Qu.:  2.00  
##  Mode  :character   Median :   8.0   Median :   821.5   Median :  4.00  
##                     Mean   :  36.4   Mean   :  4391.5   Mean   : 11.13  
##                     3rd Qu.:  24.0   3rd Qu.:  3280.8   3rd Qu.: 10.00  
##                     Max.   :2033.0   Max.   :229472.6   Max.   :399.00  
##     avgPrice          maxPrice         minPrice         avgFreight    
##  Min.   :   3.50   Min.   :   3.5   Min.   :   0.85   Min.   :  1.20  
##  1st Qu.:  52.18   1st Qu.:  81.0   1st Qu.:  21.64   1st Qu.: 14.74  
##  Median :  95.47   Median : 169.9   Median :  44.80   Median : 18.23  
##  Mean   : 176.32   Mean   : 335.4   Mean   : 110.39   Mean   : 23.38  
##  3rd Qu.: 173.99   3rd Qu.: 349.9   3rd Qu.:  99.00   3rd Qu.: 24.37  
##  Max.   :6729.00   Max.   :6735.0   Max.   :6729.00   Max.   :308.34  
##    avgRevProd       avgItemsProd    
##  Min.   :    3.5   Min.   :  1.000  
##  1st Qu.:   89.0   1st Qu.:  1.000  
##  Median :  179.8   Median :  1.667  
##  Mean   :  417.5   Mean   :  2.669  
##  3rd Qu.:  397.7   3rd Qu.:  2.726  
##  Max.   :16983.5   Max.   :128.333
###


X = unique(It[,c(1,4)]) %>% left_join(re[,2:3]) %>% 
  group_by(seller_id) %>% summarise(
    noReview = n(),
    avgScore = mean(review_score),
    minScore = min(review_score),
    maxScore = max(review_score)) 
## Joining, by = "order_id"
###

Z = Z %>% left_join(X) %>% left_join(sell)
## Joining, by = "seller_id"
## Joining, by = "seller_id"
pacman::p_load(dplyr, FactoMineR, factoextra)
ZZ <- Z[,c(1,2,3,4,5,8,9,10,11,12,15,16,17)]
library(readr)
library(dplyr)
library(d3heatmap)
library(ggplot2)
library(caTools)
## library(chorddiag)
set.seed(1234)
ZZ$grp = kmeans(scale(ZZ[,c(3,5,6,9,10)]),5)$cluster %>% factor
ZZ$grp %>% table()
## .
##    1    2    3    4    5 
##   23 1123 1526  138  285
# Z$group = factor(cutree(hc, k=4))
# 
# fviz_dend(
#   hc, k=8, show_labels=F, rect=T, rect_fill=T,
#   labels_track_height=0,
#   palette="ucscgb", rect_border="ucscgb")
pacman::p_load(dplyr, FactoMineR, factoextra)
pca = PCA(ZZ[,c(3,5,6,9,10)])

fviz_pca_ind(pca)

grp <- ZZ$grp

fviz_pca_biplot(
  pca, repel=T, col.var="black", 
  col.ind=grp, alpha.ind=0.6, pointshape=16)

# grp <- ZZ$grp
# 
# fviz_pca_biplot(
#   pca, repel=T, col.var="black", 
#   col.ind=grp, alpha.ind=0.6, pointshape=16, 
#   addEllipses = TRUE, ellipse.level = 0.6, mean.point = FALSE)
group_by(ZZ, grp) %>% summarise(
  REV=mean(Rev), 
  avgPrice=mean(avgPrice), 
  avgFreight=mean(avgFreight),
  noReview=mean(noReview),
  avgScore=mean(avgScore),
  size=n() )
## # A tibble: 5 x 7
##   grp       REV avgPrice avgFreight noReview avgScore  size
##   <fct>   <dbl>    <dbl>      <dbl>    <dbl>    <dbl> <int>
## 1 1     128544.     162.       20.9   973.       4.07    23
## 2 2       6988.     152.       22.3    54.5      3.78  1123
## 3 3        995.     111.       19.1     9.78     4.58  1526
## 4 4       8221.    1148.       83.2     8.67     4.03   138
## 5 5        475.     152.       22.1     2.94     1.59   285
### 價格高的顧客願意等???
ZZZ<- ZZ[,c(3,5,6,9,10)]
colMeans(ZZZ)
##         Rev    avgPrice  avgFreight    noReview    avgScore 
## 4391.484233  176.325102   23.380116   32.497577    3.982833
ZA<- scale(ZZZ) %>% data.frame

par(cex=0.8)
split(ZA, grp) %>% sapply(colMeans) %>% barplot(beside=T,col=rainbow(5))
legend('topright',legend=colnames(ZZZ),fill=rainbow(5))

load("~/Downloads/II.RData")
II2 <- II %>% filter(grp== 1|grp == 4| grp== 2)
II2$grp <- II2$grp %>% as.numeric() %>% as.factor()
table(II2$grp, II2$category) %>% as.data.frame.matrix %>% 
  d3heatmap(F,F,col=cm.colors(13)[3:13])
table(II$category)
## 
##              agro_industry_and_commerce 
##                                      14 
##                        air_conditioning 
##                                      19 
##                                     art 
##                                      12 
##                   arts_and_craftmanship 
##                                       6 
##                                   audio 
##                                       5 
##                                    auto 
##                                     227 
##                                    baby 
##                                      92 
##                          bed_bath_table 
##                                     124 
##                  books_general_interest 
##                                      37 
##                          books_imported 
##                                       2 
##                         books_technical 
##                                      18 
##                      christmas_supplies 
##                                       7 
##                              cine_photo 
##                                       2 
##                               computers 
##                                       6 
##                   computers_accessories 
##                                     158 
##                          consoles_games 
##                                      24 
##         construction_tools_construction 
##                                      59 
##               construction_tools_lights 
##                                      17 
##               construction_tools_safety 
##                                      15 
##                              cool_stuff 
##                                      83 
##                costruction_tools_garden 
##                                       9 
##                 costruction_tools_tools 
##                                       9 
##                     diapers_and_hygiene 
##                                       5 
##                                  drinks 
##                                      20 
##                            dvds_blu_ray 
##                                       3 
##                             electronics 
##                                      38 
##                  fashio_female_clothing 
##                                       3 
##                fashion_bags_accessories 
##                                      58 
##                   fashion_male_clothing 
##                                       4 
##                           fashion_shoes 
##                                       4 
##                           fashion_sport 
##                                       3 
##                 fashion_underwear_beach 
##                                       2 
##                         fixed_telephony 
##                                      13 
##                                 flowers 
##                                       1 
##                                    food 
##                                      12 
##                              food_drink 
##                                      16 
##                       furniture_bedroom 
##                                      12 
##                         furniture_decor 
##                                     208 
##                   furniture_living_room 
##                                      25 
##       furniture_mattress_and_upholstery 
##                                       1 
##                            garden_tools 
##                                     102 
##                           health_beauty 
##                                     282 
##                         home_appliances 
##                                      20 
##                       home_appliances_2 
##                                      17 
##                            home_confort 
##                                       3 
##                       home_construction 
##                                      22 
##                              housewares 
##                                     230 
##          industry_commerce_and_business 
##                                       9 
## kitchen_dining_laundry_garden_furniture 
##                                      22 
##                              la_cuisine 
##                                       1 
##                     luggage_accessories 
##                                      33 
##                            market_place 
##                                       8 
##                                   music 
##                                       4 
##                     musical_instruments 
##                                      38 
##                        office_furniture 
##                                      23 
##                          party_supplies 
##                                       5 
##                               perfumery 
##                                      92 
##                                pet_shop 
##                                      84 
##                  signaling_and_security 
##                                      14 
##                        small_appliances 
##                                      41 
##   small_appliances_home_oven_and_coffee 
##                                       5 
##                          sports_leisure 
##                                     272 
##                              stationery 
##                                      60 
##                  tablets_printing_image 
##                                       2 
##                               telephony 
##                                      55 
##                                    toys 
##                                     118 
##                           watches_gifts 
##                                      52
table(II$grp, II$category) %>% as.data.frame.matrix %>% 
  d3heatmap(F,F,col=cm.colors(13)[3:13])
## II %>% group_by(grp) 
II4 <- II %>% filter(grp == 3)

II4$category %>% table %>% sort(decreasing = TRUE) 
## .
##                          sports_leisure 
##                                      94 
##                                    auto 
##                                      90 
##                         furniture_decor 
##                                      89 
##                              housewares 
##                                      81 
##                           health_beauty 
##                                      78 
##                   computers_accessories 
##                                      60 
##                          bed_bath_table 
##                                      44 
##                                    toys 
##                                      33 
##                            garden_tools 
##                                      32 
##                                    baby 
##                                      29 
##                               perfumery 
##                                      28 
##                               telephony 
##                                      26 
##                              cool_stuff 
##                                      23 
##                                pet_shop 
##                                      23 
##                fashion_bags_accessories 
##                                      21 
##                             electronics 
##                                      20 
##                              stationery 
##                                      19 
##         construction_tools_construction 
##                                      15 
##                        small_appliances 
##                                      15 
##                           watches_gifts 
##                                      14 
##                   furniture_living_room 
##                                      11 
##                  books_general_interest 
##                                      10 
##                       home_construction 
##                                      10 
##                          consoles_games 
##                                       9 
##                         fixed_telephony 
##                                       9 
##                         home_appliances 
##                                       9 
##                       home_appliances_2 
##                                       9 
##                     musical_instruments 
##                                       9 
##               construction_tools_lights 
##                                       8 
## kitchen_dining_laundry_garden_furniture 
##                                       8 
##                        office_furniture 
##                                       7 
##                        air_conditioning 
##                                       6 
##                              food_drink 
##                                       6 
##                  signaling_and_security 
##                                       6 
##               construction_tools_safety 
##                                       5 
##                                  drinks 
##                                       5 
##                     luggage_accessories 
##                                       5 
##                costruction_tools_garden 
##                                       4 
##                            market_place 
##                                       4 
##              agro_industry_and_commerce 
##                                       3 
##                       furniture_bedroom 
##                                       3 
##                                     art 
##                                       2 
##                                   audio 
##                                       2 
##                         books_technical 
##                                       2 
##                      christmas_supplies 
##                                       2 
##                 costruction_tools_tools 
##                                       2 
##                     diapers_and_hygiene 
##                                       2 
##                  fashio_female_clothing 
##                                       2 
##                   fashion_male_clothing 
##                                       2 
##                           fashion_sport 
##                                       2 
##                 fashion_underwear_beach 
##                                       2 
##                                    food 
##                                       1 
##                            home_confort 
##                                       1 
##                              la_cuisine 
##                                       1 
##                          party_supplies 
##                                       1 
##   small_appliances_home_oven_and_coffee 
##                                       1
# auto          garden_tools         health_beauty        sports_leisure       furniture_decor 
#   13                    12                    10                     8                     7 
# baby computers_accessories     home_appliances_2              pet_shop            cool_stuff 
#                     6                     6                     6                     6                     5 
R <- re
R2 <- R[,c(2, 4, 5)] 
R2 <- left_join(R2, I[,c(1,4)])
## Joining, by = "order_id"
R2 <- left_join(R2[,c(4,1,2,3)], ZZ[,c(1,14)])
## Joining, by = "seller_id"
R21 <- R2 %>% filter(grp == 1)
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

docs <- Corpus(VectorSource(R21$review_comment_message))
                                         # 將內容以語料庫形式儲存
docs <- tm_map(docs, content_transformer(tolower)) 
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
docs <- tm_map(docs, removeNumbers)     #移除數字
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
docs <- tm_map(docs, removeWords,stopwords("portuguese")) # 移除常見的轉折詞彙
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("portuguese")):
## transformation drops documents
docs <- tm_map(docs, removePunctuation) #移除標點符號
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
docs <- tm_map(docs, stripWhitespace)  # 移除額外的空白
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d<- data.frame(word=names(v), freq=v)
head(d,50)
##                word freq
## produto     produto 4698
## prazo         prazo 2209
## entrega     entrega 1762
## recebi       recebi 1709
## chegou       chegou 1596
## antes         antes 1567
## bom             bom 1203
## recomendo recomendo 1098
## veio           veio 1078
## entregue   entregue 1075
## comprei     comprei 1050
## qualidade qualidade  881
## bem             bem  854
## ainda         ainda  672
## loja           loja  671
## compra       compra  610
## gostei       gostei  591
## tudo           tudo  568
## produtos   produtos  493
## super         super  486
## ótimo         ótimo  475
## apenas       apenas  464
## pedido       pedido  461
## excelente excelente  456
## boa             boa  424
## dia             dia  398
## dentro       dentro  384
## rápida       rápida  384
## relógio     relógio  353
## nao             nao  339
## lannister lannister  337
## site           site  313
## dois           dois  310
## porém         porém  310
## pois           pois  305
## agora         agora  290
## sempre       sempre  285
## ser             ser  280
## conforme   conforme  278
## pra             pra  278
## lindo         lindo  272
## correios   correios  268
## nota           nota  259
## duas           duas  258
## ótima         ótima  254
## rápido       rápido  249
## perfeito   perfeito  245
## parabéns   parabéns  234
## dias           dias  233
## adorei       adorei  232
wordcloud(words=d$word,freq=d$freq,min.freq=2,
                     max.words=150,random.order=FALSE,rot.per=0.35,
                     colors=brewer.pal(35,"Dark2"))
## Warning in brewer.pal(35, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors

Lo$geolocation_lat <- round(Lo$geolocation_lat,3)
Lo$geolocation_lng <- round(Lo$geolocation_lng,3)
unique(sell$seller_zip_code_prefix) %>% length() ### 2246
## [1] 2246
names(Lo)[1] <- "seller_zip_code_prefix"
sell$seller_zip_code_prefix %>% unique() %>% length() 
## [1] 2246
sell_lo <- left_join(sell, Lo, by = "seller_zip_code_prefix")
sell_lo2 <- sell_lo %>%  group_by(seller_id, seller_zip_code_prefix,  geolocation_state) %>% summarise(lat = mean(geolocation_lat), lng = mean(geolocation_lng))
sell_lo2 <- sell_lo2[!duplicated(sell_lo2$seller_id),]
sell_lo2 <- left_join(sell_lo2, ZZ[,c(1, 5, 10, 14)])
## Joining, by = "seller_id"
table(sell_lo2$grp)
## 
##    1    2    3    4    5 
##   23 1123 1526  138  285
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(Imap)
library(maps)
Brazil <- map_data("world")%>%filter(region=="Brazil")
ggplot() +
  geom_polygon(data = Brazil, aes(x= long, y= lat, group= group), fill="black")+
  geom_point(data= sell_lo2 , aes(x= lng, y= lat, color= grp), size = 2, alpha = 0.2)
## Warning: Removed 7 rows containing missing values (geom_point).

sell_lo22 <- sell_lo2 %>% filter(grp== 5)
ggplot() +
  geom_polygon(data = Brazil, aes(x= long, y= lat, group= group), fill="black") + geom_point(data= sell_lo22,aes(x= lng, y= lat, color= grp), size = 2, alpha = 0.2)
## Warning: Removed 1 rows containing missing values (geom_point).

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.